Load & Clean Data

set.seed(1337)
knitr::opts_chunk$set(fig.width = 10, fig.height = 8)

library(skimr)    
library(tidyverse)
library(ggplot2)
library(car)
library(e1071)
library(lubridate)
library(tidymodels)
library(caret)
library(ggpmisc)
library(ggrepel)
library(scales)
library(performance)
library(broom)
library(viridis)
library(ggridges)
library(lme4)
library(knitr)
library(kableExtra)
library(MuMIn)
library(Metrics)
library(sjPlot)
library(httpgd)
library(ggmap)
library(sf)
library(RColorBrewer)
library(broom.mixed)

source("/home/bengtegard/src/02_school_ec/06_r/ds24_r/kunskapskontroll/bengtegard_theme.R", encoding = "UTF-8")

n_cores <- parallel::detectCores()  
if(n_cores == 8) {                  
  n_cores <- 6                      
} else {
  n_cores <- 3                      
}

data <- read_csv("hemnet_sold_properties.csv")

data <- data |>
  rename(
    house_price = slutpris,
    listing_price = utgangspris,
    price_development = prisutv,
    number_of_rooms = antal_rum,
    living_area = boarea,
    secondary_area = biarea,
    plot_area = tomtarea,
    year_built = byggar,
    operating_cost = driftkostnad,
    sale_date = sälj_datum,
    neighborhood = område
  ) |>
  mutate(neighborhood = str_to_title(neighborhood))

# Convert sales dates for later analysis.
swedish_months <- c("januari" = "January", "februari" = "February", "mars" = "March", "april" = "April",
                    "maj" = "May", "juni" = "June", "juli" = "July", "augusti" = "August",
                    "september" = "September", "oktober" = "October", "november" = "November", "december" = "December")

data$sale_date <- str_replace_all(data$sale_date, swedish_months)
data$sale_date <- dmy(data$sale_date)

Lets take a quick look at our data

data
## # A tibble: 2,476 × 13
##    house_price listing_price price_development number_of_rooms living_area
##          <dbl>         <dbl>             <dbl>           <dbl>       <dbl>
##  1     5000000       5195000           -195000               5         134
##  2     5850000       5795000             55000               5         136
##  3    11000000      11195000           -195000               7         165
##  4     6750000       6995000           -245000               6         145
##  5     5910000       5995000            -85000               5         156
##  6     5700000       5995000           -295000               5         156
##  7     5850000       5495000            355000               4         126
##  8     4195000       4195000                 0               4          88
##  9     5195000       5195000                 0               6         151
## 10     6700000       6795000            -95000               7         192
## # ℹ 2,466 more rows
## # ℹ 8 more variables: secondary_area <dbl>, plot_area <dbl>, year_built <dbl>,
## #   operating_cost <dbl>, sale_date <date>, neighborhood <chr>, latitude <dbl>,
## #   longitude <dbl>

Descriptive statisticis summary and missing values

options(scipen = 999) 
skim(data)
Data summary
Name data
Number of rows 2476
Number of columns 13
_______________________
Column type frequency:
character 1
Date 1
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
neighborhood 1 1 2 15 0 90 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
sale_date 0 1 2019-09-07 2025-04-01 2022-04-25 1279

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
house_price 0 1.00 6026981.41 2592044.21 1000000.00 4400000.00 5470000.00 6902500.00 26500000.00 ▇▅▁▁▁
listing_price 6 1.00 5933505.76 2658245.95 1450000.00 4250000.00 5295000.00 6850000.00 27500000.00 ▇▂▁▁▁
price_development 6 1.00 95843.62 583096.05 -5000000.00 -195000.00 55000.00 405000.00 5850000.00 ▁▁▇▁▁
number_of_rooms 34 0.99 5.28 1.35 1.00 4.00 5.00 6.00 16.00 ▃▇▁▁▁
living_area 34 0.99 142.48 43.84 32.00 115.00 136.00 161.00 528.00 ▇▇▁▁▁
secondary_area 759 0.69 57.07 44.80 1.00 21.00 50.00 83.00 600.00 ▇▁▁▁▁
plot_area 12 1.00 651.52 340.34 59.00 441.75 605.50 800.00 5624.00 ▇▁▁▁▁
year_built 29 0.99 1963.54 33.86 1800.00 1936.00 1967.00 1989.00 2023.00 ▁▁▃▇▆
operating_cost 71 0.97 39516.26 13070.44 1000.00 31500.00 38130.00 45800.00 210705.00 ▇▃▁▁▁
latitude 0 1.00 55.61 6.09 -23.23 55.55 55.58 55.60 67.16 ▁▁▁▁▇
longitude 0 1.00 13.39 1.40 11.83 12.95 13.01 13.08 24.92 ▇▁▁▁▁

From this it becomes apperent that there is missing values in several columns. Most important is the variable secondary_area which has 759 missing values. The reason for this might be because it is less likely for a house to have a glazed veranda/porch or an unfinished attic. The definition for secondary area is: a space that is arranged in such a way that its usage is limited during certain parts of the year.

Since the secondary_area may be a significant predictor in modeling the final house price, we will create an indicator variable to account for its missing values (NA will be replaced with 0). This will allow the regression model to assess how the presence or absence of data for secondary_area influences the house price.

Additionally, other columns with missing values, such as number_of_rooms (with 34 missing values) and operating_cost (with 71 missing values), will be handled appropriately. Since these variables are essential for accurate modeling, rows with missing values in these columns will be dropped to maintain consistency and avoid skewing the results. This approach ensures that the analysis remains reliable, while minimizing the potential negative impact of incomplete data.

Add indicator variable and remove missing values

data_clean <- data |>
  mutate(
    secondary_area_missing = ifelse(is.na(secondary_area), 1, 0),
    secondary_area = ifelse(is.na(secondary_area), 0, secondary_area),
    sale_date = as.Date(sale_date),
    month = month(sale_date),
    year = year(sale_date),
    neighborhood = as.character(neighborhood)
  ) |>
  filter(
    !is.na(number_of_rooms),
    !is.na(operating_cost),
    !is.na(price_development),
    !is.na(neighborhood),
    !is.na(living_area),
    !is.na(plot_area),
    !is.na(year_built),
    !neighborhood %in% c("Malmö", "Torg", "Tomt", "Trädgård") # remove neighborhoods that is incomplete
  ) |>
  mutate(
    neighborhood = case_when(
      neighborhood == "By" ~ "Kyrkby",
      neighborhood == "Kyrby" ~ "Kyrkby",
      TRUE ~ neighborhood
    ),
    neighborhood = as.factor(neighborhood)
  )

# Check how many rows that were dropped
n_rows_drop <- nrow(data) - nrow(data_clean)
cat("Number of dropped rows:", n_rows_drop)
## Number of dropped rows: 182
# Check the tidy dataset to see e.g. the new indicator variable
head(data_clean)
## # A tibble: 6 × 16
##   house_price listing_price price_development number_of_rooms living_area
##         <dbl>         <dbl>             <dbl>           <dbl>       <dbl>
## 1     5000000       5195000           -195000               5         134
## 2     5850000       5795000             55000               5         136
## 3    11000000      11195000           -195000               7         165
## 4     6750000       6995000           -245000               6         145
## 5     5910000       5995000            -85000               5         156
## 6     5850000       5495000            355000               4         126
## # ℹ 11 more variables: secondary_area <dbl>, plot_area <dbl>, year_built <dbl>,
## #   operating_cost <dbl>, sale_date <date>, neighborhood <fct>, latitude <dbl>,
## #   longitude <dbl>, secondary_area_missing <dbl>, month <dbl>, year <dbl>

Data exploration

The response variable: house_price

# Plot distribution of house prices
ggplot(data_clean, aes(x = house_price)) +
    geom_histogram(fill = "#1B9E77") +
    geom_vline(aes(xintercept = mean(house_price, na.rm = TRUE)),
                color = "#e24b1d", linetype = "dashed", linewidth = 1) +
    geom_vline(aes(xintercept = median(house_price, na.rm = TRUE)),
                color = "#4a4aa7", linetype = "dashed", linewidth = 1) +
    scale_x_continuous(
      labels = scales::comma_format(scale = 1e-6, suffix = "M"),
      breaks = seq(0, max(data_clean$house_price, na.rm = TRUE), by = 0.5e7)) +
    bengtegard_theme() +
    labs(
      y = "Frequency", 
      x = "Final House Price in (MSEK)",
      caption = "Blue line = Median price | Red line = Mean price"
    )

# Calculate skewness
skew_value <- skewness(data_clean$house_price, na.rm = TRUE)
cat("Skewness of target variable: ", skew_value)
## Skewness of target variable:  2.297969

The target variable is right-skewed. Median is often a better measure of central tendency in skewed distributions. It is less sensitive to extreme outliers compared to the mean, which can be distorted by those outliers. In this case a skewness of 2.3 suggests that the distribution is skewed, so the median provides a more robust measure of central tendency. However, when doing regression modeling the response variable might later on benifit from log-transformation.

Log-tranformation of house_price

# Plot distribution of house prices
ggplot(data_clean, aes(x = log(house_price))) +
    geom_histogram(fill = "#1B9E77") +
    scale_x_continuous(
      labels = scales::comma_format(scale = 1e-6, suffix = "M"),
      breaks = seq(0, max(data_clean$house_price, na.rm = TRUE), by = 0.5e7)) +
    bengtegard_theme() +
    labs(
      y = "Frequency", 
      x = "Final House Price in (MSEK)"
    )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Looks more normally distributed after log-tranformation.

Extreme values in the response variable

# Zoom in on the distribution
ggplot(data_clean, aes(
        x = house_price,
        fill = (house_price == 20000000 | house_price > 20000000))) +
    geom_histogram(bins = 30) +
    scale_x_continuous(
      labels = scales::comma_format(scale = 1e-6, suffix = "M"),
      breaks = seq(0, max(data_clean$house_price, na.rm = TRUE), by = 0.5e7)) +
    coord_cartesian(xlim = c(1300000, 30000000),
                    ylim = c(0, 20)) +
    scale_fill_manual(values = c("TRUE" = "#FF6347", "FALSE" = "#1B9E77")) + 
    bengtegard_theme() +
    theme(legend.position = "bottom") +
    labs(
      y = "Frequency", 
      x = "Final House Price in (MSEK)",
    )

A total of ten house were sold for more than twenty millions (SEK). Dont know if we want to include those observations in our final analysis, because they might have a high leverage in our regression model. However, there is one house sold over 25 millions which will be removed since it will most probably affect the modeling later on.

data_clean <- data_clean |>
  filter(house_price < 25000000)

Create histograms for all numeric variables

data_clean |>
  select(where(is.numeric), -latitude, -longitude) |>
  gather(key = "Variable", value = "Value") |>  # Reshape for faceting
  ggplot(aes(x = Value)) + 
  geom_histogram(fill = "#1B9E77") + 
  facet_wrap(~Variable, scales = "free", ncol = 3) +
  bengtegard_theme() +
  labs(x = "Value", y = "Frequency")

How has the house price evolved over time?

# Aggregate data
data_summary <- data_clean |>
  group_by(year, month) |>
  summarise(avg_price = median(house_price, na.rm = TRUE)) |>
  ungroup()

# Create a complete grid of all year-month combinations (from 2020 to 2024)
complete_grid <- expand.grid(
  year = 2020:2024,  
  month = 1:12 
)

# Join the complete grid with your summarized data
data_complete <- complete_grid |>
  left_join(data_summary, by = c("year", "month")) |>
  mutate(avg_price = ifelse(is.na(avg_price), NA, avg_price),
        month = factor(month, levels = 1:12, labels = month.abb))

# Function for converting millions and add "millions (SEK)"
million_sek_format <- function(x) {
  paste0(format(x / 1e6, big.mark = ","), " millions (SEK)")
}

# Create the heatmap
ggplot(data_complete, aes(x = month, y = year, fill = avg_price)) +
  geom_tile() +
  scale_fill_viridis_c(labels = million_sek_format) +
  labs(
  title = "Monthly Median Sale Prices of Houses in Malmö (2020–2024)",
  subtitle = "Aggregated Based on Final Sale Prices (Median)",
    x = "Month",
    y = "Year",
    fill = "Average Sale Price",
    caption = "Data from Hemnet.se"
  ) +
  geom_text(aes(label = round((avg_price / 1e6), digits = 3)),
    color = "white"
  ) +
  bengtegard_theme()

Pandemic effects in 2020? We can clearly spot higher prices both for 2021 and 2022, and the general sense was that many people wanted to move to an house after the isolation period, which might explain the higher demands.

Which neighborhoods in Malmö is most expensive?

# Aggregate the data by neighborhood to get the average price
top10_neighborhoods <- data |>
  group_by(neighborhood) |>
  filter(
    (!(neighborhood %in% c("Torg", "By"))), # remove incorrect neighborhoods
    n() > 5, # keep only neighborhoods with 5+ sold houses
    latitude >= 0 & longitude >= 0) |>
  summarise(
    median_price = median(house_price, na.rm = TRUE),
    n = n()) |>
  arrange(desc(median_price)) |>
  slice_head(n = 10)

# Bar plot for the top 10 most expensive neighborhoods
ggplot(top10_neighborhoods, aes(x = reorder(neighborhood, median_price), y = median_price / 1e6, fill = median_price)) +
  geom_col() +  
  scale_fill_viridis_c(labels = million_sek_format, name = "") +  
  bengtegard_theme() +
  labs(
    title = "House Prices in the 10 Most Expensive Neighborhoods of Malmö",
    x = "Neighborhood", 
    y = "Median House Price in (MSEK)",
    caption = "Data from Hemnet.se") +
  geom_text(aes(label = median_price / 1e6),
    color = "#ffffff",
    hjust = 1.5
  ) +
  coord_flip()

What is the distribution of house prices in Malmö’s most expensive neighborhoods?

# Filter data to include only the top 10 expensive neighborhoods
top10_names <- top10_neighborhoods$neighborhood

data_top10 <- data_clean |>
  select(neighborhood, house_price) |>
  filter(neighborhood %in% top10_names)

# Ridge plot of price distributions
ggplot(data_top10, aes(x = house_price / 1e6, y = fct_reorder(neighborhood, house_price), fill = neighborhood)) +
  geom_jitter(data = data_top10, height = 0.1, alpha = 0.4, shape = 21) +
  geom_density_ridges(scale = 1.5, alpha = 0.7, color = "white") +
  scale_x_continuous(name = "Final House Price (MSEK)") +
  scale_y_discrete(name = "Neighborhood") +
  bengtegard_theme() +
  labs(
    title = "Distribution of House Prices in Malmö's Most Expensive Neighborhoods",
    caption = "Data from Hemnet.se"
  ) +
  theme(legend.position = "none")

Regression analysis

I’ve chosen to drop both listing_price and price_development from the regression analysis, as these variables are directly derived from or highly correlated with the response variable house_price. Including them would introduce data leakage, leading to an overestimation of model performance and reducing its generalizability in real-world scenarios where the house price is not yet known.

The goal of the regression is twofold:

Explanation: To identify key variables that significantly contribute to explaining the variation in house prices in Malmö, focusing on factors like living area, number of rooms, and neighborhood differences.

Prediction: To build a model that can predict house prices based on these variables, helping to estimate the value of properties in Malmö under different conditions.

Split data into three sets: train, test and validation - stratified by neighborhood.

data_regression <- data_clean |>
  select(-listing_price, -price_development)

# Stratified split: Create training set (60%) stratified by 'neighborhood'
train_index <- createDataPartition(data_regression$neighborhood, p = 0.6, list = FALSE)
houses_train <- data_regression[train_index, ]

remaining_data <- data_regression[-train_index, ]

# Stratified split: Create test and validation sets (20% each, total 40% for test+validate)
test_val_index <- createDataPartition(remaining_data$neighborhood, p = 0.5, list = FALSE)
houses_test <- remaining_data[test_val_index, ]
houses_validate <- remaining_data[-test_val_index, ]

# Check the proportions and number of rows
data_split_proportions <- sapply(list(train = houses_train, test = houses_test, validate = houses_validate), nrow) / nrow(data_regression)
data_split_proportions
##     train      test  validate 
## 0.6153511 0.1997383 0.1849106

Correlations with house_price

cor_matrix <- data_regression |> 
  select(where(is.numeric)) |> 
  cor() # calculate correlation 

# Order by house_price correlation
ordered_vars <- names(sort(cor_matrix[, "house_price"], decreasing = TRUE)) 

# Reorder the correlation matrix
cor_matrix <- cor_matrix[ordered_vars, ordered_vars]

# Plot correlation matrix using corrplot
corrplot::corrplot(cor_matrix, method = "color", type = "upper", 
                   addCoef.col = "black", number.cex = 0.7, tl.cex = 0.8,
                   tl.col = "black", diag = FALSE)

Plotting the most promising predictors

The most promising predictors for house price is:

living_area (r = 0.67)

number_of_rooms (r = 0.49)

operating_cost (r = 0.38)

plot_area (r = 0.32)

The numeric variable with the highest correlation with house_price is the living_area. This make a lot of sense; big houses are generally more expensive.

pred1 <- ggplot(data_regression, aes(x = living_area, y = house_price / 1e6)) +
  geom_point(alpha = 0.5, color = "#1B9E77") +
  stat_poly_line() +
  stat_poly_eq(use_label(c("R2"))) +
  geom_text_repel(
    aes(label = ifelse(living_area > 360, rownames(data_clean), "")),
    size = 3,
    color = "red"
  ) +
  labs(
    x = "Living Area (m²)",
    y = "House Price (MSEK)",
    title = "House Price vs Living Area with Outliers Labeled"
  ) +
  bengtegard_theme()

pred1

The scatter plot with line of best fit (95% confidence interval) shows a strong, positive relationship between the response variable house_price and the predictor living_area (with an model marginal of R2 = 0.45). A possible high leverage point is labeled to (row 1530) and a possible outlier (row 303). Even though it’s an its an high-value, it follows the regression line.

The variable with the second highest correlation with house_price is the number_of_rooms. Most likely the number_of_rooms (including living_area) works as an proxy for the size of the house. Which is not that suprising.

pred2 <- ggplot(data_regression, aes(
  x = factor(number_of_rooms), 
  y = house_price / 1e6)) +
  geom_point(alpha = 0.5, size = 2, color = "#1B9E77") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "Number of Rooms",
    y = "House Price (MSEK)"
  ) +
  bengtegard_theme()

pred2

From this boxplot we can see that there is a positive relationship between the predictor number_of_rooms and the response variable house_price. An increased number of rooms in a house is associated with an higher price. However, one house with 12 rooms and two houses with 13 rooms does not seem to follow this trend.

The third best predictor for house_price is operating_cost. Higher operating costs might be associated with larger homes as well as premium homes with pools, saunas, underfloor heating, etc. So it might both be a proxy for the size of the house and its quality/luxury.

pred3 <- ggplot(data_regression, aes(x = operating_cost, y = house_price / 1e6)) +
  geom_point(alpha = 0.5, color = "#1B9E77") +
  stat_poly_line() +
  stat_poly_eq(use_label(c("R2"))) +
  geom_text_repel(
    aes(label = ifelse(operating_cost > 150000, rownames(data_clean), "")),
    size = 3,
    color = "red"
  ) +
  labs(
    x = "Operating Cost",
    y = "House Price (MSEK)"
  ) +
  bengtegard_theme()

pred3

The scatter plot with line of best fit (95% confidence interval) shows a weak, positive relationship between the response variable house_price and the predictor plot_area (with an model marginal of R2 = 0.11). Two possible high leverage point is labeled (rows 1530 and 745 ) and one potential outlier (row 303)

The fourth predictor for house_price is plot_area. Larger plots often allow for bigger homes, better location and more privacy. As such, land value itself can be a big contributor to price, especially in high-demand areas.

pred4 <- ggplot(data_regression, aes(x = plot_area, y = house_price / 1e6)) +
  geom_point(alpha = 0.5, color = "#1B9E77") +
  stat_poly_line() +
  stat_poly_eq(use_label(c("R2"))) +
  geom_text_repel(
    aes(label = ifelse(plot_area > 2300, rownames(data_clean), "")),
    size = 3,
    color = "red"
  ) +
  labs(
    x = "Plot Area (m²)",
    y = "House Price (MSEK)"
  ) +
  bengtegard_theme()

pred4

The scatter plot with line of best fit (95% confidence interval) shows a moderate, positive relationship between the response variable house_price and the predictor operating_cost (with an model marginal of R2 = 0.15). Seven possible outliers or high leverage points is labeled (rows 765, 753, 384, 1684, 953, 952 and 1997).

Modeling

Lets first fit the full model and check its model summary and assumptions.

# Full model formula
full_model <- lm(house_price ~ living_area + number_of_rooms + 
              operating_cost + plot_area +  year +
              year_built + secondary_area + secondary_area_missing + neighborhood,
              data = houses_train)

# Model summary
summary(full_model)
## 
## Call:
## lm(formula = house_price ~ living_area + number_of_rooms + operating_cost + 
##     plot_area + year + year_built + secondary_area + secondary_area_missing + 
##     neighborhood, data = houses_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5924747  -616542    -6703   588391  9045025 
## 
## Coefficients:
##                                   Estimate     Std. Error t value
## (Intercept)                 -168294963.627   42363524.396  -3.973
## living_area                      21608.405       1370.464  15.767
## number_of_rooms                  80031.324      39422.088   2.030
## operating_cost                       8.081          3.101   2.606
## plot_area                         1208.625        135.659   8.909
## year                             74708.237      20943.961   3.567
## year_built                        9069.749       1250.153   7.255
## secondary_area                    3976.416       1076.806   3.693
## secondary_area_missing          -14161.281      89725.433  -0.158
## neighborhoodAlmvik             -602753.397     875413.942  -0.689
## neighborhoodÄngar              1337676.290    1005239.061   1.331
## neighborhoodAnnetorp           2209714.186    1001105.622   2.207
## neighborhoodBellevue           1539666.423     588768.219   2.615
## neighborhoodBellevuesjösida    9558640.366    1322104.316   7.230
## neighborhoodBergdala           -198017.664     727087.200  -0.272
## neighborhoodBulltofta          -848254.557     762263.399  -1.113
## neighborhoodBunkeflo           2339378.814    1310208.107   1.786
## neighborhoodBunkeflostrand      919261.575     547970.890   1.678
## neighborhoodCity               4300930.368    1007046.957   4.271
## neighborhoodDammfri           -1414389.009    1327471.738  -1.065
## neighborhoodDjupadal           1002867.876     562299.153   1.784
## neighborhoodEkoby              -237189.650    1005275.133  -0.236
## neighborhoodElinegård          1214650.374    1319241.700   0.921
## neighborhoodEllenborg          -801789.658     874214.600  -0.917
## neighborhoodEriksfält             5065.507     579586.728   0.009
## neighborhoodFårabäck          -1819374.933    1318338.763  -1.380
## neighborhoodFosie             -3607074.900    1314068.721  -2.745
## neighborhoodFridhem            5221785.932     742452.075   7.033
## neighborhoodGlostorp          -1049177.752    1003430.317  -1.046
## neighborhoodGröndal             835226.034    1312062.836   0.637
## neighborhoodGullvik            -749304.586     580604.387  -1.291
## neighborhoodGullviksborg      -1749485.578    1311349.109  -1.334
## neighborhoodHåkanstorp         1209353.247     668605.041   1.809
## neighborhoodHamnen             6022579.915     733389.190   8.212
## neighborhoodHemgården           -42831.115    1314512.274  -0.033
## neighborhoodHindby             -764753.308     619801.931  -1.234
## neighborhoodHöja               -185088.823    1311635.658  -0.141
## neighborhoodHusie              -141438.539     555709.922  -0.255
## neighborhoodHyllie              628993.038     657516.015   0.957
## neighborhoodHyllieby            833280.200     669670.306   1.244
## neighborhoodJägersro          -1021586.631    1002030.080  -1.020
## neighborhoodJohanneslust       1458354.631     669205.458   2.179
## neighborhoodKäglinge           -810150.305     569857.223  -1.422
## neighborhoodKastanjegården    -1249419.112     599218.570  -2.085
## neighborhoodKattarp            -694314.225     590764.970  -1.175
## neighborhoodKirseberg           915158.417     702718.879   1.302
## neighborhoodKlagshamn           801338.509     562757.707   1.424
## neighborhoodKlagstorp           -58473.150     806311.791  -0.073
## neighborhoodKristineberg       -472314.209     574932.565  -0.822
## neighborhoodKroksbäck           134522.273     803592.850   0.167
## neighborhoodKulladal             32054.349     562403.478   0.057
## neighborhoodKungshälla         -650191.869     875685.345  -0.742
## neighborhoodKvarnby             216336.467     606297.218   0.357
## neighborhoodKvissle           -4738149.584    1102585.127  -4.297
## neighborhoodKyrkby             -907012.581     567068.996  -1.599
## neighborhoodLimhamn            2762595.798     549621.542   5.026
## neighborhoodLindeborg          -469529.319     759552.753  -0.618
## neighborhoodLockarp           -1161734.750    1004260.405  -1.157
## neighborhoodMellanheden        4299538.022     611524.753   7.031
## neighborhoodNydala              -32630.350    1002324.402  -0.033
## neighborhoodÖn                 2828052.490    1005933.969   2.811
## neighborhoodOxie              -1142881.692     573929.250  -1.991
## neighborhoodRiseberga          -219970.218     566327.694  -0.388
## neighborhoodRosengård          -533735.175     756493.784  -0.706
## neighborhoodRosenvång          3152365.439     588859.162   5.353
## neighborhoodRostorp            1524344.730     629056.820   2.423
## neighborhoodSallerup          -1334066.009    1002475.857  -1.331
## neighborhoodSegevång           -601083.540    1004787.834  -0.598
## neighborhoodSibbarp            2680880.299     619398.830   4.328
## neighborhoodSjösida            6014307.814     578683.072  10.393
## neighborhoodSjöstad            2454078.287     732882.463   3.349
## neighborhoodSkrävlinge           79219.257    1000383.001   0.079
## neighborhoodSkumparp            195694.153     807775.423   0.242
## neighborhoodSöderkulla         -534321.723     803579.801  -0.665
## neighborhoodSofielund          1129467.158     612533.823   1.844
## neighborhoodSolbacken          2626861.454     624646.277   4.205
## neighborhoodStenkällan        -1064059.771     683534.450  -1.557
## neighborhoodStrandhem           677956.285     603111.210   1.124
## neighborhoodToarp             -1437533.445     805997.889  -1.784
## neighborhoodToftanäs          -3479542.455    1318372.915  -2.639
## neighborhoodToftängen         -1331612.228    1313124.885  -1.014
## neighborhoodTullstorp          -681295.372     881485.536  -0.773
## neighborhoodTygelsjö             80232.413     559631.096   0.143
## neighborhoodUlricedal          -779241.088     684926.706  -1.138
## neighborhoodValdemarsro        -203812.276     645796.351  -0.316
## neighborhoodVårsången         -1205715.832    1005174.930  -1.200
## neighborhoodVäster             8035482.450    1316810.369   6.102
## neighborhoodVästervång         3515736.499     725661.455   4.845
## neighborhoodVidedal            -192705.968     573463.087  -0.336
## neighborhoodVillastad          -461190.951     583591.866  -0.790
## neighborhoodVintrie            1179209.484     622050.427   1.896
## neighborhoodVirentofta         -237964.961     569450.173  -0.418
##                                         Pr(>|t|)    
## (Intercept)                 0.000074924120833226 ***
## living_area                 < 0.0000000000000002 ***
## number_of_rooms                         0.042545 *  
## operating_cost                          0.009260 ** 
## plot_area                   < 0.0000000000000002 ***
## year                                    0.000374 ***
## year_built                  0.000000000000683484 ***
## secondary_area                          0.000231 ***
## secondary_area_missing                  0.874616    
## neighborhoodAlmvik                      0.491237    
## neighborhoodÄngar                       0.183516    
## neighborhoodAnnetorp                    0.027467 *  
## neighborhoodBellevue                    0.009023 ** 
## neighborhoodBellevuesjösida 0.000000000000816329 ***
## neighborhoodBergdala                    0.785400    
## neighborhoodBulltofta                   0.265993    
## neighborhoodBunkeflo                    0.074410 .  
## neighborhoodBunkeflostrand              0.093667 .  
## neighborhoodCity            0.000020870092609582 ***
## neighborhoodDammfri                     0.286856    
## neighborhoodDjupadal                    0.074733 .  
## neighborhoodEkoby                       0.813512    
## neighborhoodElinegård                   0.357366    
## neighborhoodEllenborg                   0.359229    
## neighborhoodEriksfält                   0.993028    
## neighborhoodFårabäck                    0.167805    
## neighborhoodFosie                       0.006134 ** 
## neighborhoodFridhem         0.000000000003232821 ***
## neighborhoodGlostorp                    0.295941    
## neighborhoodGröndal                     0.524512    
## neighborhoodGullvik                     0.197083    
## neighborhoodGullviksborg                0.182398    
## neighborhoodHåkanstorp                  0.070714 .  
## neighborhoodHamnen          0.000000000000000513 ***
## neighborhoodHemgården                   0.974012    
## neighborhoodHindby                      0.217472    
## neighborhoodHöja                        0.887802    
## neighborhoodHusie                       0.799135    
## neighborhoodHyllie                      0.338934    
## neighborhoodHyllieby                    0.213605    
## neighborhoodJägersro                    0.308145    
## neighborhoodJohanneslust                0.029491 *  
## neighborhoodKäglinge                    0.155358    
## neighborhoodKastanjegården              0.037254 *  
## neighborhoodKattarp                     0.240095    
## neighborhoodKirseberg                   0.193037    
## neighborhoodKlagshamn                   0.154698    
## neighborhoodKlagstorp                   0.942200    
## neighborhoodKristineberg                0.411503    
## neighborhoodKroksbäck                   0.867080    
## neighborhoodKulladal                    0.954558    
## neighborhoodKungshälla                  0.457920    
## neighborhoodKvarnby                     0.721287    
## neighborhoodKvissle         0.000018557882579018 ***
## neighborhoodKyrkby                      0.109955    
## neighborhoodLimhamn         0.000000568590607771 ***
## neighborhoodLindeborg                   0.536573    
## neighborhoodLockarp                     0.247561    
## neighborhoodMellanheden     0.000000000003284917 ***
## neighborhoodNydala                      0.974035    
## neighborhoodÖn                          0.005006 ** 
## neighborhoodOxie                        0.046651 *  
## neighborhoodRiseberga                   0.697772    
## neighborhoodRosengård                   0.480600    
## neighborhoodRosenvång       0.000000101736169048 ***
## neighborhoodRostorp                     0.015517 *  
## neighborhoodSallerup                    0.183494    
## neighborhoodSegevång                    0.549796    
## neighborhoodSibbarp         0.000016168935493323 ***
## neighborhoodSjösida         < 0.0000000000000002 ***
## neighborhoodSjöstad                     0.000835 ***
## neighborhoodSkrävlinge                  0.936894    
## neighborhoodSkumparp                    0.808614    
## neighborhoodSöderkulla                  0.506214    
## neighborhoodSofielund                   0.065418 .  
## neighborhoodSolbacken       0.000027825442090001 ***
## neighborhoodStenkällan                  0.119781    
## neighborhoodStrandhem                   0.261176    
## neighborhoodToarp                       0.074728 .  
## neighborhoodToftanäs                    0.008406 ** 
## neighborhoodToftängen                   0.310731    
## neighborhoodTullstorp                   0.439723    
## neighborhoodTygelsjö                    0.886023    
## neighborhoodUlricedal                   0.255452    
## neighborhoodValdemarsro                 0.752357    
## neighborhoodVårsången                   0.230546    
## neighborhoodVäster          0.000000001372567926 ***
## neighborhoodVästervång      0.000001416923379547 ***
## neighborhoodVidedal                     0.736895    
## neighborhoodVillastad                   0.429516    
## neighborhoodVintrie                     0.058221 .  
## neighborhoodVirentofta                  0.676099    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1195000 on 1319 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.7896 
## F-statistic: 59.14 on 91 and 1319 DF,  p-value: < 0.00000000000000022
# Check model assumptions with diagnostic plots
check_model(full_model)

Model assumptions:

  1. Posterior Predictive Check (Top Left)

The predicted distribution (blue) follows the observed one (green) fairly well. This suggests that the model captures the overall shape of the outcome distribution.

  1. Linearity (Top Right)

The residuals vs fitted plot shows som curvature, suggesting:

  • Non-linearity between predictors and response variable
  • Possibly non-linear relationships (like log or polynomial terms)
  1. Homescedasticity (Middle Left)

Heteroscedasticity present - residuals fan out as fitted values increase.

  1. Influential observations (Middle Right)

A few points have high leverage and high standardized residuals (those labeled: e.g., 848, 872, 195). They’re not dramatically off the charts, but worth checking.

  1. Collinearity (Bottom Left)

All Variance Inflation Factor (VIF) values are below 5 (a common threshold for concern) except one. No strong multicollinearity.

  1. Normality of Residuals (Bottom Right)

Some Skewness — The residuals deviate from the QQ-line at the tails (i.e., not perfectly normal).

Multiple regression model with log-transformed response variable

The diagnostic plots suggests that there might exist a non-linear relationship between house_price and the predictors and that the model fit has room for improvement. By log-tranforming the response variable we might be able to stabilize variance and also handle outliers better.

# Model formula with log-tranformed Y variable house_price
log_model <- lm(log(house_price) ~ living_area + number_of_rooms + 
              operating_cost + plot_area +
              year_built + year + secondary_area + secondary_area_missing + neighborhood,
              data = houses_train)

# Check model assumptions with diagnostic plots
check_model(log_model)

From the diagnostic plots it becomes apperent that the model fits better with an log-transformed response variable.

Types of Regression Models

# Model 1: Linear regression with only living_area
linear_model <- lm(log(house_price) ~ living_area, data = houses_train)

# Model 2: Multiple linear regression with all numeric predictors
mlr_model <- lm(log(house_price) ~ living_area + number_of_rooms +
  operating_cost + plot_area + year_built + year +  secondary_area + secondary_area_missing, 
  data = houses_train)

# Model 3: Multiple linear regression with all predictors (including categorical variable neighborhood)
mlr_neighborhood <- lm(log(house_price) ~ living_area + number_of_rooms +
  operating_cost + plot_area + year_built + year + secondary_area + secondary_area_missing + neighborhood, 
  data = houses_train)

# Model 4: Mixed-effects model with neighborhood as random effect
mixed_model <- lmer(log(house_price) ~ living_area + number_of_rooms + 
  operating_cost + plot_area + year_built + year + secondary_area + secondary_area_missing + (1 | neighborhood), 
  data = houses_train)

Model Evaluation

# Predictions
val_pred_m1 <- predict(linear_model, newdata = houses_validate)
val_pred_m2 <- predict(mlr_model, newdata = houses_validate)
val_pred_m3 <- predict(mlr_neighborhood, newdata = houses_validate)
val_pred_m4 <- predict(mixed_model, newdata = houses_validate, allow.new.levels = TRUE)

# Actual log house prices
actual_values <- log(houses_validate$house_price)

# Calculate RMSE for each model
rmse_m1 <- rmse(actual_values, val_pred_m1)
rmse_m2 <- rmse(actual_values, val_pred_m2)
rmse_m3 <- rmse(actual_values, val_pred_m3)
rmse_m4 <- rmse(actual_values, val_pred_m4)

# Calculate range of the actual values
val_range <- max(actual_values) - min(actual_values)

# Normalize RMSE using the range (min-max)
nrmse_m1 <- rmse_m1 / val_range
nrmse_m2 <- rmse_m2 / val_range
nrmse_m3 <- rmse_m3 / val_range
nrmse_m4 <- rmse_m4 / val_range

# Format NRMSE for display
nrmse_fmt <- function(x) paste0(round(x, 4), " (", round(x * 100, 2), "%)")

# Get conditional and marginal R² for mixed model
r2_mixed <- r.squaredGLMM(mixed_model)

# Build performance table
regression_results <- data.frame(
  Model = c("Linear Regression (Living Area)", 
            "Multiple Linear Regression (Numerical Predictors)", 
            "Multiple Linear Regression (including Neighborhood)",
            "Mixed Effects Model"),
  `Residual Standard Error` = c(
    sigma(linear_model),
    sigma(mlr_model),
    sigma(mlr_neighborhood),
    sigma(mixed_model)
  ),
  `R squared` = c(
    summary(linear_model)$r.squared,
    summary(mlr_model)$r.squared,
    summary(mlr_neighborhood)$r.squared,
    r2_mixed[1, 2]  # Conditional R2
  ),
  `Adjusted R squared` = c(
    summary(linear_model)$adj.r.squared,
    summary(mlr_model)$adj.r.squared,
    summary(mlr_neighborhood)$adj.r.squared,
    r2_mixed[1, 1]  # Marginal R2
  ),
  AIC = c(
    AIC(linear_model),
    AIC(mlr_model),
    AIC(mlr_neighborhood),
    AIC(mixed_model)
  ),
  BIC = c(
    BIC(linear_model),
    BIC(mlr_model),
    BIC(mlr_neighborhood),
    BIC(mixed_model)
  ),
  RMSE = c(
    rmse_m1,
    rmse_m2,
    rmse_m3,
    rmse_m4
  ),
  NRMSE = c(
    nrmse_fmt(nrmse_m1),
    nrmse_fmt(nrmse_m2),
    nrmse_fmt(nrmse_m3),
    nrmse_fmt(nrmse_m4)
  ),
  check.names = FALSE
)

# Output HTML table with footnotes
kable(regression_results, format = "html", caption = "Model Performance Comparison on Validation set") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:ncol(regression_results), width = "150px") %>%
  footnote(
    general = "The mixed effect model shows the variance explained by fixed effects alone (Marginal R² = 0.2474) and by both fixed and random effects (Conditional R² = 0.7864)",
    general_title = "Note:",
    footnote_as_chunk = TRUE,
    escape = FALSE
  )
Model Performance Comparison on Validation set
Model Residual Standard Error R squared Adjusted R squared AIC BIC RMSE NRMSE
Linear Regression (Living Area) 0.2816937 0.4365654 0.4361655 432.9530 448.7092 0.2664160 0.1202 (12.02%)
Multiple Linear Regression (Numerical Predictors) 0.2765542 0.4596356 0.4565522 387.9626 440.4831 0.2590940 0.1169 (11.69%)
Multiple Linear Regression (including Neighborhood) 0.1822204 0.7792927 0.7640657 -709.4557 -221.0146 0.1682445 0.0759 (7.59%)
Mixed Effects Model 0.1825548 0.7864598 0.2474114 -409.9597 -352.1871 0.1687955 0.0762 (7.62%)
Note: The mixed effect model shows the variance explained by fixed effects alone (Marginal R² = 0.2474) and by both fixed and random effects (Conditional R² = 0.7864)

The Mixed Effects Model performs similarly to the best model in terms of error (RMSE ≈ 0.17), but has the added benefit of easier interpretation. Unlike the multiple linear regression with 90 neighborhood dummy variables, the mixed model uses random effects for neighborhoods, making it more efficient and readable. It explains ~78.6% of the variance (R²), with a low NRMSE of 7.6%. This balance of performance and simplicity is why it was chosen as the final model.

validation_results <- bind_rows(
  tibble(model = "Linear", pred = val_pred_m1),
  tibble(model = "Multiple", pred = val_pred_m2),
  tibble(model = "MLR + Neighborhood", pred = val_pred_m3),
  tibble(model = "Mixed", pred = val_pred_m4)
) |>
  mutate(actual = rep(log(houses_validate$house_price), 4),
         model = factor(model),
         price_actual = exp(actual),
         price_pred = exp(pred))

rmse_lookup <- tibble(
  model = factor(c("Linear", "Multiple", "MLR + Neighborhood", "Mixed")),
  dist = -c(rmse_m1, rmse_m2, rmse_m3, rmse_m4)
)

validation_results <- left_join(validation_results, rmse_lookup, by = "model")

ggplot(validation_results, aes(x = price_actual / 1e6, y = price_pred / 1e6)) +
  geom_point(alpha = 0.5, color = "black") + 
  geom_smooth(method = "lm", se = FALSE, aes(group = model, color = model), alpha = 0.7, linewidth = 1.2) +  # Fitted lines for each model
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "#ff0000") +  
  labs(
    title = "Comparison of Fitted Regression Lines Across Models",
    x = "Actual Price (MSEK)",
    y = "Predicted Price (MSEK)",
    color = "Model"
  ) +
  scale_color_manual(values = c("Linear" = "#4b4bc2", 
                                "Multiple" = "#d45b5b", 
                                "MLR + Neighborhood" = "#88e088", 
                                "Mixed" = "#aa57dd")) +
  bengtegard_theme() +
  theme(
    legend.position = "bottom", 
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12)
  ) +
  coord_fixed(ratio = 1)
## `geom_smooth()` using formula = 'y ~ x'

Final Model Choice - Linear Mixed-Effect Model

Outlier analysis

# Diagnostic model information
augmented_mixed_model <- augment(mixed_model)

# Calculate studentized residuals
std_cond_resid <- rstudent(mixed_model)

# Get residual standard deviation
resid_sd <- sigma(mixed_model) 

# Add studentized conditional residuals manually
augmented_mixed_model <- augmented_mixed_model |>
  mutate(.std.cond.resid = std_cond_resid)

# Calculate studentized conditional residuals above > 3
outliers_std_cond <- augmented_mixed_model |>
  filter(abs(.std.cond.resid) > 3) 

# Print outliers based on studentized conditional residuals
print(outliers_std_cond)
## # A tibble: 10 × 22
##    `log(house_price)` living_area number_of_rooms operating_cost plot_area
##                 <dbl>       <dbl>           <dbl>          <dbl>     <dbl>
##  1               14.9         165               8          33500       476
##  2               15.7         365              13         157837       910
##  3               14.7         110               5          22200      1454
##  4               15.4         180               5          48885       690
##  5               15.3         270               7          49718      2473
##  6               16.0         170               7          29700       463
##  7               16.9         528              14         159686      1626
##  8               16.3         133               5          36600       104
##  9               15.8         148               5          34868      1069
## 10               13.8          46               2           6720       907
## # ℹ 17 more variables: year_built <dbl>, year <dbl>, secondary_area <dbl>,
## #   secondary_area_missing <dbl>, neighborhood <fct>, .fitted <dbl>,
## #   .resid <dbl>, .hat <dbl>, .cooksd <dbl>, .fixed <dbl>, .mu <dbl>,
## #   .offset <dbl>, .sqrtXwt <dbl>, .sqrtrwt <dbl>, .weights <dbl>,
## #   .wtres <dbl>, .std.cond.resid <dbl>

A total of 10 observations show standardized residuals above 3, which are typically considered strong outliers. Most of these correspond to houses in the higher price range and appear to reflect genuine variation rather than data issues. Since the model is intended to both explain and predict house prices, removing these observations could reduce its generalizability. However, a few cases display exceptionally large residuals and may warrant further investigation. This can be further explored through the residual plots.

library(ggrepel)

ggplot(augmented_mixed_model, aes(x = .fitted, y = .std.cond.resid)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  geom_hline(yintercept = c(-2, 2), color = "blue", linetype = "dashed") +
  geom_text_repel(
    aes(label = ifelse(abs(.std.cond.resid) > 3.3, rownames(augmented_mixed_model), "")),
    size = 3,
    color = "red"
  ) +
  labs(
    x = "Fitted Values", 
    y = "Studentized Conditional Residuals",
    title = "Studentized Residuals vs Fitted Values"
  )

From the residual plot, we can identify four observations that stand out from the rest: rows 1357, 192, 928, and 1267. All four have standardized residuals greater than 3.3 in absolute value, indicating that they deviate significantly from the model’s predictions. These observations will be considered outliers or influential points, as they may disproportionately impact the model’s estimates and potentially bias the results.

Compare Models after removing outliers

# Remove the outliers
houses_train_cleaned <- houses_train[-c(1348, 184, 943, 340, 1262, 678, 829), ]

# Refit the mixed model without the outliers
mixed_model_clean <- lmer(log(house_price) ~ living_area + number_of_rooms + 
                          operating_cost + plot_area + year_built + year + secondary_area + 
                          secondary_area_missing + (1 | neighborhood), 
                          data = houses_train_cleaned)


# Compare AIC and BIC for both models
aic_comparison <- AIC(mixed_model, mixed_model_clean)
bic_comparison <- BIC(mixed_model, mixed_model_clean)

# Display AIC and BIC
print("AIC Comparison:")
## [1] "AIC Comparison:"
print(aic_comparison)
##                   df       AIC
## mixed_model       11 -409.9597
## mixed_model_clean 11 -510.0302
print("BIC Comparison:")
## [1] "BIC Comparison:"
print(bic_comparison)
##                   df       BIC
## mixed_model       11 -352.1871
## mixed_model_clean 11 -452.3123
# Calculate R² values (Marginal and Conditional) for both models
r2_mixed_model <- performance::r2(mixed_model)
r2_mixed_model_clean <- performance::r2(mixed_model_clean)

# Display R² values
cat("\nR² Values (Marginal / Conditional):\n")
## 
## R² Values (Marginal / Conditional):
cat("Original Mixed Model: ", round(r2_mixed_model$R2_marginal, 3), "/", round(r2_mixed_model$R2_conditional, 3), "\n")
## Original Mixed Model:  0.247 / 0.786
cat("Cleaned Mixed Model:  ", round(r2_mixed_model_clean$R2_marginal, 3), "/", round(r2_mixed_model_clean$R2_conditional, 3), "\n")
## Cleaned Mixed Model:   0.236 / 0.804

Removing the outliers improves both AIC and conditional R², while lowering the BIC. Therefore, the model without outliers is preferable for further analysis and prediction.

Test Best Model

# Predict on the test set for final unbiased evaluation
final_predictions <- predict(mixed_model_clean, newdata = houses_test, allow.new.levels = TRUE)

predicted_prices_test <- exp(final_predictions) # transform to original scale
actual_prices_test <- houses_test$house_price

# Plot Predicted Price
ggplot(houses_test, aes(x = actual_prices_test / 1e6, y = predicted_prices_test / 1e6)) +
  geom_point(alpha = 0.6, color = "#1B9E77") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray30") +
  labs(
    x = "Actual House Price (MSEK)",
    y = "Predicted House Price (MSEK)",
    title = "Final Model Predictions on Test Set",
    caption = "Dashed line shows perfect predictions (y = x)."
  ) +
  bengtegard_theme()
## `geom_smooth()` using formula = 'y ~ x'

Check Model assumption

check_model(mixed_model_clean)

Model assumptions

Posterior Predictive Check: The predicted data closely follows the observed log-transformed house prices, suggesting good model fit.

Linearity: The residuals appear randomly scattered around zero, indicating that the linearity assumption holds.

Collinearity (VIF): All VIF values are below 5, suggesting no serious multicollinearity among predictors.

High leverage points: There is some data points that exhibit high leverage and might be due to houses within the higher price range.

Normality of Residuals And Random Effects: The Q-Q plot shows that most residuals lie close to the line for both fixed effects and random effects, suggesting normality.

Residual Distribution

# Diagnostic model information
augmented_mixed_model_clean <- augment(mixed_model_clean)

ggplot(augmented_mixed_model_clean, aes(x = .resid)) +
  geom_density(fill = "darkblue", alpha = 0.7) +
  labs(
    title = "Residual Distribution",
    x = "Residuals",
    y = "Frequency"
  ) 

Residuals looks normally distributed. Nice!

Final Model Summary

final_rmse <- rmse(actual_prices_test, predicted_prices_test)

# Calculate value range for NRMSE
test_range <- max(actual_prices_test) - min(actual_prices_test)
nrmse_final <-  final_rmse / test_range

# Conditional and Marginal R² for final model
r2_final <- r.squaredGLMM(mixed_model_clean)

final_model_summary <- data.frame(
  Metric = c(
    "Residual Standard Error",
    "Marginal R² (Fixed effects)",
    "Conditional R² (Fixed + Random effects)",
    "AIC",
    "BIC",
    "RMSE (in millions SEK)",
    "NRMSE"
  ),
  Value = c(
    round(sigma(mixed_model), 3),
    round(r2_final[1, 1], 4),
    round(r2_final[1, 2], 4),
    round(AIC(mixed_model), 1),
    round(BIC(mixed_model), 1),
    round(final_rmse),  # RMSE in millions SEK, rounded to 2 decimal places
    nrmse_fmt(nrmse_final)
  )
)

# Display the performance summary table
knitr::kable(final_model_summary, caption = "Performance Summary of Final Mixed Effects Model")
Performance Summary of Final Mixed Effects Model
Metric Value
Residual Standard Error 0.183
Marginal R² (Fixed effects) 0.2355
Conditional R² (Fixed + Random effects) 0.8037
AIC -410
BIC -352.2
RMSE (in millions SEK) 1113164
NRMSE 0.0643 (6.43%)

Wald-chi square tests for significance of fixed effects

Anova(mixed_model_clean, type = 3) 
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: log(house_price)
##                           Chisq Df            Pr(>Chisq)    
## (Intercept)              3.7539  1             0.0526859 .  
## living_area            199.8925  1 < 0.00000000000000022 ***
## number_of_rooms         14.6451  1             0.0001298 ***
## operating_cost           3.6486  1             0.0561159 .  
## plot_area               80.1843  1 < 0.00000000000000022 ***
## year_built             110.4281  1 < 0.00000000000000022 ***
## year                    13.7676  1             0.0002069 ***
## secondary_area           7.8585  1             0.0050581 ** 
## secondary_area_missing   1.3920  1             0.2380706    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Most variables significantly affect house prices. Living area, plot area, year built, year of sale, and number of rooms have strong effects. Operating cost is borderline, and missing secondary area is not significant.

summary(mixed_model_clean)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(house_price) ~ living_area + number_of_rooms + operating_cost +  
##     plot_area + year_built + year + secondary_area + secondary_area_missing +  
##     (1 | neighborhood)
##    Data: houses_train_cleaned
## 
## REML criterion at convergence: -532
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.7495 -0.6128  0.0301  0.6282  3.6099 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  neighborhood (Intercept) 0.08889  0.2981  
##  Residual                 0.03072  0.1753  
## Number of obs: 1404, groups:  neighborhood, 84
## 
## Fixed effects:
##                              Estimate     Std. Error t value
## (Intercept)            -12.0414335979   6.2149758156  -1.937
## living_area              0.0028605276   0.0002023242  14.138
## number_of_rooms          0.0221062676   0.0057765514   3.827
## operating_cost           0.0000008743   0.0000004577   1.910
## plot_area                0.0001755991   0.0000196100   8.955
## year_built               0.0019286646   0.0001835342  10.508
## year                     0.0114041473   0.0030735013   3.710
## secondary_area           0.0004444804   0.0001585556   2.803
## secondary_area_missing  -0.0155389493   0.0131705752  -1.180
## 
## Correlation of Fixed Effects:
##             (Intr) lvng_r nmbr__ oprtn_ plot_r yr_blt year   scndr_
## living_area -0.058                                                 
## numbr_f_rms -0.010 -0.679                                          
## opertng_cst  0.238 -0.192 -0.041                                   
## plot_area    0.044 -0.240  0.016 -0.028                            
## year_built  -0.039 -0.266  0.041  0.130  0.280                     
## year        -0.998  0.073  0.006 -0.247 -0.061 -0.019              
## secondary_r -0.071 -0.013 -0.002 -0.106 -0.233  0.063  0.067       
## scndry_r_ms  0.020 -0.048  0.054 -0.021  0.028 -0.052 -0.018  0.483
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling

In the final model, all main predictors except operating cost and secondary_area_missing were statistically significant at the 0.05 level. A 1 m² increase in living area was associated with an approximate 0.29% increase in house price, while one additional room increased price by 2.23%. Each extra square meter of plot area was linked to a 0.018% price increase, and homes built more recently sold for more — with a 0.19% increase per year built. Sale date had a small positive effect (0.0033% per day), reflecting general price appreciation over time. Houses missing secondary area data were associated with a 3.56% lower sale price.

Although the effect of operating cost was small (0.00013% per SEK) and only marginally significant, it was included in the final model due to theoretical relevance.

Random effects captured variation between neighborhoods. The intraclass correlation coefficient (ICC) was high, indicating that approximately 74% of the variance in house prices could be attributed to differences between neighborhoods. This supports the inclusion of a random intercept for neighborhood, accounting for clustered structure in the data.

Note: Due to the differences in scale among predictor variables (such as living area, number of rooms, etc.), I’ve used centered and scaled values for all predictors. This was necessary for model convergence. Additionally, we are using the log-transformed house price as the response variable to ensure a more normal distribution and to facilitate interpretability.

# Standardize all continuous predictors
houses_train_scaled_all <- houses_train_cleaned |>
  mutate(across(c(living_area, number_of_rooms, operating_cost, plot_area, secondary_area, year_built), scale))

# Re-fit the model with standardized predictors
mixed_model_standardized <- lmer(log(house_price) ~ living_area + number_of_rooms + operating_cost + plot_area + year_built + year +  secondary_area + secondary_area_missing + (1 | neighborhood), 
  data = houses_train_scaled_all)

Standardized Coefficient Plot for Fixed Effects

# Get the coefficients and confidence intervals from the model
fixed_eff_standardized <- broom.mixed::tidy(mixed_model_standardized, effects = "fixed", conf.int = TRUE)

# Exponentiate the coefficients and confidence intervals
fixed_eff_exp_standardized <- fixed_eff_standardized %>%
  mutate(across(c(estimate, conf.low, conf.high), exp))

# Convert exponentiated coefficients to percentage change
fixed_eff_exp_standardized$percent_change <- (fixed_eff_exp_standardized$estimate - 1) * 100

# Coefficient plot 
ggplot(fixed_eff_exp_standardized, aes(x = estimate, y = term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
  scale_x_continuous(trans = 'log10', limits = c(0.9, 1.5)) +
  labs(title = "Impact of Predictors on House Price (Log-Scale)",
       subtitle = "Exponentiated fixed effects with 95% confidence intervals",
       x = "Effect on House Price for One Standard Deviation Change",
       y = "Predictors") +
  bengtegard_theme() +
  geom_text(aes(label = paste0(round(percent_change, 1), "%")), 
            hjust = 0.3, vjust = -2, size = 4, color = "#ff6347")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_errorbarh()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

Plotting Random Effect (Neighborhood)

# Plot random effects from your mixed model
plot_model(mixed_model_clean, type = "re", show.values = FALSE, show.p = FALSE) +
  labs(title = "Random Effects: Neighborhood Intercepts", 
       x = "Random Effect Estimate", 
       y = "Neighborhood") +
  bengtegard_theme()

Table of Model summary with 95% CI for all predictors

tab_model(mixed_model_standardized,
          show.re.var = TRUE,              
          pred.labels = c( "(Intercept)", "Living area (m²)", "Number of rooms", "Operating cost (SEK)", "Plot area (m²)", "Year built", 
          "Sale Year", "Secondary area missing (Yes/No)"),
          dv.labels = "Impact of Predictors on House Sale Price",            
          show.p = TRUE,                      
          p.style = "numeric",                
          digits = 3,                        
          string.ci = "95% CI",               
          transform = NULL)                   
  Impact of Predictors on House Sale Price
Predictors Estimates 95% CI p
(Intercept) -7.567 -19.756 – 4.623 0.224
living_area 0.119 0.103 – 0.136 <0.001
number_of_rooms 0.029 0.014 – 0.044 <0.001
operating_cost 0.011 -0.000 – 0.022 0.056
plot_area 0.059 0.046 – 0.072 <0.001
year_built 0.065 0.053 – 0.077 <0.001
year 0.011 0.005 – 0.017 <0.001
secondary_area 0.019 0.006 – 0.033 0.005
secondary_area_missing -0.016 -0.041 – 0.010 0.238
Random Effects
σ2 0.03
τ00 neighborhood 0.09
ICC 0.74
N neighborhood 84
Observations 1404
Marginal R2 / Conditional R2 0.236 / 0.804
exp_fixef <- exp(fixef(mixed_model_clean))
randef <- ranef(mixed_model_clean)

# Exponentiate the random effects
exp_randef <- exp(randef[[1]])

# View the exponentiated fixed + random effects
print(exp_fixef)
##            (Intercept)            living_area        number_of_rooms 
##         0.000005894837         1.002864622842         1.022352421601 
##         operating_cost              plot_area             year_built 
##         1.000000874253         1.000175614562         1.001930525709 
##                   year         secondary_area secondary_area_missing 
##         1.011469422481         1.000444579224         0.984581157291
print(exp_randef)
##                 (Intercept)
## Almhög            0.9892941
## Almvik            0.8773378
## Ängar             1.1684436
## Annetorp          1.2628771
## Bellevue          1.2102296
## Bellevuesjösida   1.5744463
## Bergdala          0.9408789
## Bulltofta         0.7925177
## Bunkeflo          1.3167191
## Bunkeflostrand    1.1147922
## City              1.7265479
## Dammfri           0.9087925
## Djupadal          1.1485945
## Ekoby             0.8267888
## Elinegård         1.1157282
## Ellenborg         0.8189886
## Eriksfält         0.9342160
## Fårabäck          0.6824235
## Fosie             0.6180031
## Fridhem           1.5630069
## Glostorp          0.6879326
## Gröndal           1.1431049
## Gullvik           0.8188438
## Gullviksborg      0.8162554
## Håkanstorp        1.2166558
## Hamnen            1.9742215
## Hemgården         0.8497665
## Hindby            0.8491177
## Höja              0.9378576
## Husie             0.9292166
## Hyllie            1.0952191
## Hyllieby          1.1234752
## Jägersro          0.7840213
## Johanneslust      1.2440830
## Käglinge          0.8338934
## Kastanjegården    0.7713732
## Kattarp           0.8347789
## Kirseberg         1.1195921
## Klagshamn         1.1174680
## Klagstorp         0.9803140
## Kristineberg      0.8732435
## Kroksbäck         0.9972611
## Kulladal          0.9955292
## Kungshälla        0.8723921
## Kvarnby           1.0017838
## Kvissle           0.5741698
## Kyrkby            0.7616164
## Limhamn           1.4674224
## Lindeborg         0.8938489
## Lockarp           0.8802164
## Mellanheden       1.6631092
## Nydala            0.9762126
## Ön                1.3906989
## Oxie              0.7459791
## Riseberga         0.9228824
## Rosengård         0.8494891
## Rosenvång         1.5093132
## Rostorp           1.2856661
## Sallerup          0.6504962
## Segevång          0.7855289
## Sibbarp           1.3990929
## Sjösida           1.8190184
## Sjöstad           1.3796774
## Skrävlinge        0.9799272
## Skumparp          1.0278154
## Söderkulla        0.9015053
## Sofielund         1.1934374
## Solbacken         1.4411833
## Stenkällan        0.8336516
## Strandhem         0.9981171
## Toarp             0.6615856
## Toftanäs          0.6684293
## Toftängen         0.8181040
## Tullstorp         0.5400964
## Tygelsjö          0.9724329
## Ulricedal         0.8295291
## Valdemarsro       0.9187385
## Vårsången         0.7133796
## Väster            2.2299427
## Västervång        1.5422363
## Videdal           0.9080732
## Villastad         0.8791035
## Vintrie           1.1834719
## Virentofta        0.9276889

Prediction intervals

predict_prices_with_random_effects <- function(house_df, model, group_var = "neighborhood") {
  # Fixed effects
  exp_fixef <- exp(fixef(model))
  
  # Random intercepts
  ranef_vals <- ranef(model)[[group_var]]
  ranef_vals$group <- rownames(ranef_vals)
  
  # Join random effects to house data
  house_df$group <- as.character(house_df[[group_var]])
  house_df <- merge(house_df, ranef_vals, by = "group", all.x = TRUE)
  
  # Handle missing neighborhoods (i.e., new levels)
  house_df$`(Intercept)`[is.na(house_df$`(Intercept)`)] <- 0
  
  # Calculate exponentiated intercepts
  house_df$rand_intercept_exp <- exp(house_df$`(Intercept)`)
  intercept_exp <- exp_fixef["(Intercept)"]
  
  # Predict prices
  house_df$predicted_price <- intercept_exp * house_df$rand_intercept_exp *
    exp_fixef["living_area"]^house_df$living_area *
    exp_fixef["number_of_rooms"]^house_df$number_of_rooms *
    exp_fixef["operating_cost"]^house_df$operating_cost *
    exp_fixef["plot_area"]^house_df$plot_area *
    exp_fixef["year_built"]^house_df$year_built *
    exp_fixef["year"]^house_df$year *
    exp_fixef["secondary_area"]^house_df$secondary_area *
    exp_fixef["secondary_area_missing"]^house_df$secondary_area_missing
  
  return(house_df[, c(group_var, "predicted_price")])
}

# Example prediction
example_houses <- data.frame(
  neighborhood = c("Fosie", "Limhamn"),
  living_area = c(130, 100),
  number_of_rooms = c(5, 3),
  operating_cost = c(5000, 10000),
  plot_area = c(100, 250),
  year_built = c(2005, 1993),
  year = c(2019, 2021),
  secondary_area = c(30, 0),
  secondary_area_missing = c(0, 1)
)

preds <- predict_prices_with_random_effects(example_houses, mixed_model_clean)
knitr::kable(preds, caption = "Predicted House Prices with Random Effects")
Predicted House Prices with Random Effects
neighborhood predicted_price
Fosie 2919480
Limhamn 6096095